@@ -17,12 +17,15 @@
 static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
 static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
 
+static const char * impl_strs[] = { "simd", "reference", "rmse" };
+static_assert(sizeof(impl_strs) == GGML_QUANTIZE_IMPL_COUNT * sizeof(char *), "Incomplete implementation list");
+
 struct quantize_stats_params {
     std::string model = "models/7B/ggml-model-f16.bin";
     bool verbose = false;
     bool per_layer_stats = false;
     bool print_histogram = false;
-    bool reference = false;
+    std::vector<ggml_quantize_impl_t> include_impl;
     std::vector<std::string> include_layers;
     std::vector<std::string> exclude_layers;
     std::vector<enum ggml_type> include_types;
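
Note (not part of the patch): this hunk refers to ggml_quantize_impl_t and GGML_QUANTIZE_IMPL_COUNT, which are not shown in the diff. Given the impl_strs ordering ("simd", "reference", "rmse") and the static_assert tying the array to the count, the corresponding ggml.h declaration presumably looks roughly like the sketch below; the enumerator names are assumptions, only the count and ordering are implied by this change.

// Hypothetical declaration consistent with impl_strs[]; names are illustrative.
typedef enum ggml_quantize_impl {
    GGML_QUANTIZE_IMPL_SIMD      = 0, // vectorized quantization path
    GGML_QUANTIZE_IMPL_REFERENCE = 1, // scalar reference implementation
    GGML_QUANTIZE_IMPL_RMSE      = 2, // RMSE-minimizing variant
    GGML_QUANTIZE_IMPL_COUNT
} ggml_quantize_impl_t;
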
@@ -48,8 +51,8 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
     fprintf(stderr, "  -h, --help            show this help message and exit\n");
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "  -r, --reference\n");
-    fprintf(stderr, "                        use reference implementation (default: false)\n");
+    fprintf(stderr, "  -i, --implementation\n");
+    fprintf(stderr, "                        select implementation (simd, reference, rmse)\n");
     fprintf(stderr, "  -v, --verbose\n");
     fprintf(stderr, "                        verbose output (default: false)\n");
     fprintf(stderr, "  -p, --per-layer-stats\n");
@@ -104,11 +107,12 @@ double find_quantile(const error_stats & stats, double quantile) {
     return INFINITY;
 }
 
-void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
+void print_error_stats(const std::string & name, ggml_quantize_impl_t impl, const error_stats & stats, bool print_histogram) {
     double rmse = sqrt(stats.total_error / (double) stats.num_samples);
     double median = find_quantile(stats, .5);
     double pct95 = find_quantile(stats, .95);
-    printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
+    printf("%-4s %-10s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n",
+            name.c_str(), impl_strs[impl], rmse, stats.max_error, pct95, median);
     if (print_histogram) {
         printf("Error distribution:\n");
         for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
@@ -136,7 +140,7 @@ void test_roundtrip_on_layer(
         std::string & name,
         bool print_layer_stats,
         const quantize_fns_t & qfns,
-        bool use_reference,
+        ggml_quantize_impl_t impl,
         const ggml_tensor * layer,
         float * input_scratch,
         char *quantized_scratch,
@@ -158,11 +162,7 @@ void test_roundtrip_on_layer(
             input_scratch = ggml_get_data_f32(layer) + offset;
         }
 
-        if (use_reference) {
-            qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
-        } else {
-            qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
-        }
+        qfns.quantize_row_q[impl](input_scratch, quantized_scratch, chunk_size);
         qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
 
         update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
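
Note (not part of the patch): the single indexed call above implies that quantize_row_q in quantize_fns_t has become an array of function pointers, one per implementation, while dequantize_row_q stays a plain pointer. A minimal sketch of the assumed shape, with typedef names chosen for illustration rather than taken from ggml.h:

// Assumed layout after this change; only the quantize_row_q array is implied
// by the diff, the other members and typedef names are illustrative.
typedef void (*quantize_row_q_t)(const float * x, void * y, int k);
typedef void (*dequantize_row_q_t)(const void * x, float * y, int k);

typedef struct {
    dequantize_row_q_t dequantize_row_q;
    quantize_row_q_t   quantize_row_q[GGML_QUANTIZE_IMPL_COUNT]; // indexed by ggml_quantize_impl_t
    // ... remaining fields (e.g. dot-product kernels) unchanged by this diff
} quantize_fns_t;
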
@@ -171,7 +171,7 @@ void test_roundtrip_on_layer(
         }
     }
     if (print_layer_stats) {
-        print_error_stats(name, layer_error, false);
+        print_error_stats(name, impl, layer_error, false);
     }
 }
 
@@ -190,8 +190,21 @@ int main(int argc, char ** argv) {
         if (arg == "-h" || arg == "--help") {
            quantize_stats_print_usage(argc, argv);
            exit(0);
-        } else if (arg == "-r" || arg == "--reference") {
-            params.reference = true;
+        } else if (arg == "-i" || arg == "--implementation") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int j;
+            for (j = 0; j < GGML_QUANTIZE_IMPL_COUNT && strcmp(argv[i], impl_strs[j]) != 0; j++) {
+                // find match
+            }
+            if (j < GGML_QUANTIZE_IMPL_COUNT) {
+                params.include_impl.push_back((ggml_quantize_impl_t)j);
+            } else {
+                fprintf(stderr, "error: %s not in list of implementations\n", argv[i]);
+                invalid_param = true;
+            }
         } else if (arg == "-v") {
             params.verbose = true;
         } else if (arg == "-p" || arg == "--per-layer-stats") {
@@ -302,42 +315,48 @@ int main(int argc, char ** argv) {
     std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
     std::vector<float> output_scratch(SCRATCH_ELEMENTS);
 
-    // loop throught quantization types
-    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
-        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
+    // loop through quantization types
+    for (int type = 0; type < GGML_TYPE_COUNT; type++) {
+        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), type) == params.include_types.end()) {
             continue;
         }
-        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+        quantize_fns_t qfns = ggml_internal_get_quantize_fn(type);
         if (qfns.quantize_row_q && qfns.dequantize_row_q) {
-            if (params.verbose) {
-                printf("testing %s ...\n", type_strs[i]);
-            }
-
-            error_stats global_stats {};
-
-            for (const auto & kv_tensor : tensors_sorted) {
-                if (!layer_included(params, kv_tensor.first)) {
+            for (int impl = 0; impl < GGML_QUANTIZE_IMPL_COUNT; impl++) {
+                if (!params.include_impl.empty() && std::find(params.include_impl.begin(), params.include_impl.end(), impl) == params.include_impl.end()) {
                    continue;
                }
+
                if (params.verbose) {
-                    printf("%s ...\n", kv_tensor.first.c_str());
+                    printf("testing %s %s ...\n", type_strs[type], impl_strs[impl]);
                }
-                std::string layer_name { type_strs[i] };
-                layer_name += "::" + kv_tensor.first;
-                test_roundtrip_on_layer(
-                        layer_name,
-                        params.per_layer_stats,
-                        qfns,
-                        params.reference,
-                        kv_tensor.second,
-                        input_scratch.data(),
-                        quantized_scratch.data(),
-                        output_scratch.data(),
-                        global_stats
-                );
-            }
-
-            print_error_stats(type_strs[i], global_stats, params.print_histogram);
+
+                error_stats global_stats {};
+
+                for (const auto & kv_tensor : tensors_sorted) {
+                    if (!layer_included(params, kv_tensor.first)) {
+                        continue;
+                    }
+                    if (params.verbose) {
+                        printf("%s ...\n", kv_tensor.first.c_str());
+                    }
+                    std::string layer_name { type_strs[type] };
+                    layer_name += "::" + kv_tensor.first;
+                    test_roundtrip_on_layer(
+                            layer_name,
+                            params.per_layer_stats,
+                            qfns,
+                            (ggml_quantize_impl_t)impl,
+                            kv_tensor.second,
+                            input_scratch.data(),
+                            quantized_scratch.data(),
+                            output_scratch.data(),
+                            global_stats
+                    );
+                }
+
+                print_error_stats(type_strs[type], (ggml_quantize_impl_t)impl, global_stats, params.print_histogram);
+            }
         }
     }
 