@@ -56,43 +56,42 @@ struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

int32_t n_threads = cpu_get_num_math();
- int32_t n_threads_draft = -1;
- int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
- int32_t n_threads_batch_draft = -1;
- int32_t n_predict = -1; // new tokens to predict
- int32_t n_ctx = 0; // context size
- int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
- int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
- int32_t n_keep = 0; // number of tokens to keep from initial prompt
- int32_t n_draft = 5; // number of tokens to draft during speculative decoding
- int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
- int32_t n_parallel = 1; // number of parallel sequences to decode
- int32_t n_sequences = 1; // number of sequences to decode
- float p_split = 0.1f; // speculative decoding split probability
- int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
- int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
- llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
- int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
- float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
- int32_t n_beams = 0; // if non-zero then use beam search of given width.
- int32_t grp_attn_n = 1; // group-attention factor
- int32_t grp_attn_w = 512; // group-attention width
- int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
- float rope_freq_base = 0.0f; // RoPE base frequency
- float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
+ int32_t n_threads_draft = -1;
+ int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
+ int32_t n_threads_batch_draft = -1;
+ int32_t n_predict = -1; // new tokens to predict
+ int32_t n_ctx = 0; // context size
+ int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
+ int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
+ int32_t n_keep = 0; // number of tokens to keep from initial prompt
+ int32_t n_draft = 5; // number of tokens to draft during speculative decoding
+ int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
+ int32_t n_parallel = 1; // number of parallel sequences to decode
+ int32_t n_sequences = 1; // number of sequences to decode
+ float p_split = 0.1f; // speculative decoding split probability
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+ int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+ float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+ int32_t n_beams = 0; // if non-zero then use beam search of given width.
+ int32_t grp_attn_n = 1; // group-attention factor
+ int32_t grp_attn_w = 512; // group-attention width
+ int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
+ float rope_freq_base = 0.0f; // RoPE base frequency
+ float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
- float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
+ float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
float yarn_beta_fast = 32.0f; // YaRN low correction dim
- float yarn_beta_slow = 1.0f; // YaRN high correction dim
- int32_t yarn_orig_ctx = 0; // YaRN original context length
+ float yarn_beta_slow = 1.0f; // YaRN high correction dim
+ int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
- std::string rpc_servers = ""; // comma separated list of RPC servers

ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;

ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

+ enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
@@ -114,7 +113,9 @@ struct gpt_params {
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
std::string logits_file = ""; // file for saving *all* logits
+ std::string rpc_servers = ""; // comma separated list of RPC servers

+ std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
@@ -124,23 +125,24 @@ struct gpt_params {
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

+ int32_t verbosity = 0;
int32_t control_vector_layer_start = -1; // layer range for control vector
int32_t control_vector_layer_end = -1; // layer range for control vector

- int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
- int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
- // (which is more convenient to use for plotting)
- //
- bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
- size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
+ int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+ int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+ // (which is more convenient to use for plotting)
+ //
+ bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+ size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

- bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
- size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+ bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+ size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed

- bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
- size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+ bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+ size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed

- bool kl_divergence = false; // compute KL divergence
+ bool kl_divergence = false; // compute KL divergence

bool usage = false; // print usage
bool use_color = false; // use color to distinguish generations and inputs
@@ -163,7 +165,6 @@ struct gpt_params {
bool logits_all = false; // return logits for all tokens in the batch
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
- bool verbose = false;
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
bool infill = false; // use infill mode
@@ -180,10 +181,10 @@ struct gpt_params {
std::vector<std::string> image; // path to image file(s)

// server params
- int32_t port = 8080;
- int32_t timeout_read = 600;
- int32_t timeout_write = timeout_read;
- int32_t n_threads_http = -1;
+ int32_t port = 8080; // server listens on this network port
+ int32_t timeout_read = 600; // http read timeout in seconds
+ int32_t timeout_write = timeout_read; // http write timeout in seconds
+ int32_t n_threads_http = -1; // number of threads to use for http server (-1 = use n_threads)

std::string hostname = "127.0.0.1";
std::string public_path = "";
@@ -219,6 +220,16 @@ struct gpt_params {
// passkey params
int32_t n_junk = 250; // number of times to repeat the junk text
int32_t i_pos = -1; // position of the passkey in the junk text
+
+ // imatrix params
+ std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
+
+ int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
+ int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
+ int32_t i_chunk = 0; // start processing from this chunk
+
+ bool process_output = false; // collect data for the output tensor
+ bool compute_ppl = true; // whether to compute perplexity
};

void gpt_params_handle_model_default(gpt_params & params);
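For context, here is a minimal, illustrative sketch of how calling code might populate the fields this change introduces or relocates (the imatrix parameters, the moved `split_mode`, and the new `rpc_servers` / `in_files` / `verbosity` members). It is not part of the diff: the helper name and the example input file are placeholders, and it assumes the header that declares `gpt_params` (common.h in llama.cpp) is available along with llama.h for `LLAMA_SPLIT_MODE_LAYER`.

```cpp
#include "common.h" // assumed: the llama.cpp header that declares gpt_params

// Hypothetical helper (not part of this change) that overrides the defaults
// shown in the diff for an imatrix-style run.
static void configure_imatrix_run(gpt_params & params) {
    // new imatrix params (diff defaults: "imatrix.dat", 10, 0, 0, false, true)
    params.out_file       = "imatrix.dat"; // where the resulting imatrix is written
    params.n_out_freq     = 10;            // output the imatrix every 10 iterations
    params.n_save_freq    = 0;             // 0 = no intermediate saves
    params.i_chunk        = 0;             // start processing from the first chunk
    params.process_output = false;         // do not collect data for the output tensor
    params.compute_ppl    = true;          // also compute perplexity while collecting

    // fields added or moved elsewhere in the struct by this diff
    params.split_mode  = LLAMA_SPLIT_MODE_LAYER;  // how to split the model across GPUs
    params.rpc_servers = "";                      // comma separated list of RPC servers
    params.in_files.push_back("calibration.txt"); // placeholder input file
    params.verbosity   = 1;                       // int32_t that takes over from the removed bool verbose
}
```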