
Commit f83351f

imatrix : migrate to gpt_params (#7771)
* imatrix : migrate to gpt_params
  ggml-ci
* imatrix : add --save-frequency cli arg
* common : fix --no-ppl
1 parent ad675e1 commit f83351f

5 files changed (+212, -214 lines)

common/common.cpp

Lines changed: 74 additions & 1 deletion
@@ -273,6 +273,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         }
     } catch (const std::invalid_argument & ex) {
         fprintf(stderr, "%s\n", ex.what());
+        params = params_org;
         return false;
     }
 
@@ -408,6 +409,20 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
+    if (arg == "--in-file") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        std::ifstream file(argv[i]);
+        if (!file) {
+            fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+            invalid_param = true;
+            return true;
+        }
+        params.in_files.push_back(argv[i]);
+        return true;
+    }
     if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
         if (++i >= argc) {
             invalid_param = true;
@@ -1081,7 +1096,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         return true;
     }
     if (arg == "-v" || arg == "--verbose") {
-        params.verbose = true;
+        params.verbosity = 1;
+        return true;
+    }
+    if (arg == "--verbosity") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.verbosity = std::stoi(argv[i]);
         return true;
     }
     if (arg == "--verbose-prompt") {
@@ -1537,6 +1560,46 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.i_pos = std::stoi(argv[i]);
         return true;
     }
+    if (arg == "-o" || arg == "--output" || arg == "--output-file") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.out_file = argv[i];
+        return true;
+    }
+    if (arg == "-ofreq" || arg == "--output-frequency") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_out_freq = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--save-frequency") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_save_freq = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--process-output") {
+        params.process_output = true;
+        return true;
+    }
+    if (arg == "--no-ppl") {
+        params.compute_ppl = false;
+        return true;
+    }
+    if (arg == "--chunk" || arg == "--from-chunk") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.i_chunk = std::stoi(argv[i]);
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
@@ -1612,6 +1675,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", "-h, --help, --usage", "print usage and exit" });
     options.push_back({ "*", " --version", "show version and build info" });
     options.push_back({ "*", "-v, --verbose", "print verbose information" });
+    options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity });
     options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
     options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
     options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
@@ -1637,6 +1701,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
     options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() });
     options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
+    options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" });
     options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
     options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" });
     options.push_back({ "*", " --no-escape", "do not process escape sequences" });
@@ -1804,6 +1869,14 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk });
     options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos });
 
+    options.push_back({ "imatrix" });
+    options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() });
+    options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq });
+    options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq });
+    options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" });
+    options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" });
+    options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk });
+
     options.push_back({ "bench" });
     options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
     options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" });
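
With these options registered in common, a tool that links against it can read the imatrix settings straight from `gpt_params` after parsing. The sketch below is illustrative only and is not part of this commit: `gpt_params_parse` and the field names come from the diffs on this page, while the `main()` scaffolding around them is hypothetical.

```cpp
// Hypothetical consumer of the new common options (not from this commit).
#include "common.h"

#include <cstdio>

int main(int argc, char ** argv) {
    gpt_params params;

    if (!gpt_params_parse(argc, argv, params)) {
        // on an invalid argument the parser restores the original params and returns false
        return 1;
    }

    for (const auto & fname : params.in_files) {
        fprintf(stderr, "will combine with previously computed file '%s'\n", fname.c_str());
    }

    fprintf(stderr, "output: '%s', written every %d chunk(s), extra copy every %d chunk(s), ppl: %s\n",
            params.out_file.c_str(), params.n_out_freq, params.n_save_freq,
            params.compute_ppl ? "on" : "off");

    return 0;
}
```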

common/common.h

Lines changed: 55 additions & 44 deletions
@@ -56,43 +56,42 @@ struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
     int32_t n_threads = cpu_get_num_math();
-    int32_t n_threads_draft = -1;
-    int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft = -1;
-    int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx = 0; // context size
-    int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep = 0; // number of tokens to keep from initial prompt
-    int32_t n_draft = 5; // number of tokens to draft during speculative decoding
-    int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
-    int32_t n_parallel = 1; // number of parallel sequences to decode
-    int32_t n_sequences = 1; // number of sequences to decode
-    float p_split = 0.1f; // speculative decoding split probability
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
-    llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
-    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
-    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
-    int32_t n_beams = 0; // if non-zero then use beam search of given width.
-    int32_t grp_attn_n = 1; // group-attention factor
-    int32_t grp_attn_w = 512; // group-attention width
-    int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
-    float rope_freq_base = 0.0f; // RoPE base frequency
-    float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
+    int32_t n_threads_draft = -1;
+    int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
+    int32_t n_threads_batch_draft = -1;
+    int32_t n_predict = -1; // new tokens to predict
+    int32_t n_ctx = 0; // context size
+    int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep = 0; // number of tokens to keep from initial prompt
+    int32_t n_draft = 5; // number of tokens to draft during speculative decoding
+    int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
+    int32_t n_parallel = 1; // number of parallel sequences to decode
+    int32_t n_sequences = 1; // number of sequences to decode
+    float p_split = 0.1f; // speculative decoding split probability
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    int32_t n_beams = 0; // if non-zero then use beam search of given width.
+    int32_t grp_attn_n = 1; // group-attention factor
+    int32_t grp_attn_w = 512; // group-attention width
+    int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
+    float rope_freq_base = 0.0f; // RoPE base frequency
+    float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
     float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
-    float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
+    float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
     float yarn_beta_fast = 32.0f; // YaRN low correction dim
-    float yarn_beta_slow = 1.0f; // YaRN high correction dim
-    int32_t yarn_orig_ctx = 0; // YaRN original context length
+    float yarn_beta_slow = 1.0f; // YaRN high correction dim
+    int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold
-    std::string rpc_servers = ""; // comma separated list of RPC servers
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
 
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
 
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
 
@@ -114,7 +113,9 @@ struct gpt_params {
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
     std::string logits_file = ""; // file for saving *all* logits
+    std::string rpc_servers = ""; // comma separated list of RPC servers
 
+    std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
@@ -124,23 +125,24 @@ struct gpt_params {
 
     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
 
+    int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end = -1; // layer range for control vector
 
-    int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
-    int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
-    // (which is more convenient to use for plotting)
-    //
-    bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
-    size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
+    int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+    int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+    // (which is more convenient to use for plotting)
+    //
+    bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+    size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
 
-    bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
-    size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+    bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+    size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
 
-    bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
-    size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+    bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+    size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
 
-    bool kl_divergence = false; // compute KL divergence
+    bool kl_divergence = false; // compute KL divergence
 
     bool usage = false; // print usage
     bool use_color = false; // use color to distinguish generations and inputs
@@ -163,7 +165,6 @@ struct gpt_params {
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
    bool use_mlock = false; // use mlock to keep model in memory
-    bool verbose = false;
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
     bool infill = false; // use infill mode
@@ -180,10 +181,10 @@ struct gpt_params {
     std::vector<std::string> image; // path to image file(s)
 
     // server params
-    int32_t port = 8080;
-    int32_t timeout_read = 600;
-    int32_t timeout_write = timeout_read;
-    int32_t n_threads_http = -1;
+    int32_t port = 8080; // server listens on this network port
+    int32_t timeout_read = 600; // http read timeout in seconds
+    int32_t timeout_write = timeout_read; // http write timeout in seconds
+    int32_t n_threads_http = -1; // number of threads to use for http server (-1 = use n_threads)
 
     std::string hostname = "127.0.0.1";
     std::string public_path = "";
@@ -219,6 +220,16 @@ struct gpt_params {
     // passkey params
     int32_t n_junk = 250; // number of times to repeat the junk text
     int32_t i_pos = -1; // position of the passkey in the junk text
+
+    // imatrix params
+    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
+
+    int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
+    int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
+    int32_t i_chunk = 0; // start processing from this chunk
+
+    bool process_output = false; // collect data for the output tensor
+    bool compute_ppl = true; // whether to compute perplexity
 };
 
 void gpt_params_handle_model_default(gpt_params & params);
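
The two frequencies added to `gpt_params` serve different purposes: `n_out_freq` (default 10) periodically overwrites the main output file, while `n_save_freq` (default 0, i.e. never) additionally keeps separate copies. Below is a minimal sketch of how a processing loop might use them, assuming a made-up `save_imatrix` helper and a `.N` suffix for the copies; the actual imatrix implementation may name things differently.

```cpp
// Sketch only: the gpt_params fields are from the diff above; save_imatrix and
// the snapshot naming scheme are assumptions made for illustration.
#include "common.h"

#include <cstdio>
#include <string>

static void save_imatrix(const std::string & path) {
    fprintf(stderr, "saving imatrix to '%s'\n", path.c_str()); // placeholder for real serialization
}

static void process_chunks(const gpt_params & params, int n_chunk_total) {
    for (int i = params.i_chunk; i < n_chunk_total; ++i) {
        // ... collect activation statistics for chunk i ...

        const int n_done = i + 1;
        if (params.n_out_freq > 0 && n_done % params.n_out_freq == 0) {
            save_imatrix(params.out_file);                                // rolling result, overwritten in place
        }
        if (params.n_save_freq > 0 && n_done % params.n_save_freq == 0) {
            save_imatrix(params.out_file + "." + std::to_string(n_done)); // separate snapshot copy
        }
    }
}
```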

examples/imatrix/README.md

Lines changed: 7 additions & 4 deletions
@@ -6,16 +6,19 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
 ## Usage
 
 ```
-./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>]
-          [-ofreq num_chunks] [-ow <0 or 1>] [other common params]
+./imatrix \
+    -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
+    [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
+    [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
 ```
 
 Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
 The parameters in square brackets are optional and have the following meaning:
 * `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
 * `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
-* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
-* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
+* `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
+* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
+* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
 
 For faster computation, make sure to use GPU offloading via the `-ngl` argument
 
