Skip to content

Commit 40ebf81

Browse files
committed
Q4_0 scale selection using RMSE
1 parent 62cfc54 commit 40ebf81

File tree

9 files changed

+304
-96
lines changed

9 files changed

+304
-96
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ $(info I CC: $(CCV))
133133
$(info I CXX: $(CXXV))
134134
$(info )
135135

136-
default: main quantize perplexity embedding
136+
default: main quantize quantize-stats perplexity embedding
137137

138138
#
139139
# Build library

SHA256SUMS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
2+
0cc0b0a3dc8cd29f005946f8364ac2bbce797e792a40c0fb4114615e4f825976 models/7B/ggml-model-f16.bin
3+
5dec1979849d73e361a8bcc10bc8f53237cbbe435a572882dc87629e011e24b3 models/7B/ggml-model-q4_0.bin
24
7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
35
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
46
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
7+
7da75a2a164a8fb4cfbdd4823111f3545c690c5d75c345a2419a9f1e2d24080f models/13B/ggml-model-f16.bin
8+
4c5a285985bac6b8dcc56a97752b8ab70687ce0584daa6bb418ee458d91126e8 models/13B/ggml-model-q4_0.bin
59
4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json
610
e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth
711
4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth

examples/quantize-stats/quantize-stats.cpp

Lines changed: 61 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,15 @@
1717
static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
1818
static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
1919

20+
static const char * impl_strs[] = { "simd", "reference", "rmse" };
21+
static_assert(sizeof(impl_strs) == GGML_QUANTIZE_IMPL_COUNT * sizeof(char *), "Incomplete implementation list");
22+
2023
struct quantize_stats_params {
2124
std::string model = "models/7B/ggml-model-f16.bin";
2225
bool verbose = false;
2326
bool per_layer_stats = false;
2427
bool print_histogram = false;
25-
bool reference = false;
28+
std::vector<ggml_quantize_impl_t> include_impl;
2629
std::vector<std::string> include_layers;
2730
std::vector<std::string> exclude_layers;
2831
std::vector<enum ggml_type> include_types;
@@ -48,8 +51,8 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
4851
fprintf(stderr, " -h, --help show this help message and exit\n");
4952
fprintf(stderr, " -m FNAME, --model FNAME\n");
5053
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
51-
fprintf(stderr, " -r, --reference\n");
52-
fprintf(stderr, " use reference implementation (default: false)\n");
54+
fprintf(stderr, " -i, --implementation\n");
55+
fprintf(stderr, " select implementation (simd, reference, rmse)\n");
5356
fprintf(stderr, " -v, --verbose\n");
5457
fprintf(stderr, " verbose output (default: false)\n");
5558
fprintf(stderr, " -p, --per-layer-stats\n");
@@ -104,11 +107,12 @@ double find_quantile(const error_stats & stats, double quantile) {
104107
return INFINITY;
105108
}
106109

107-
void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
110+
void print_error_stats(const std::string & name, ggml_quantize_impl_t impl, const error_stats & stats, bool print_histogram) {
108111
double rmse = sqrt(stats.total_error / (double) stats.num_samples);
109112
double median = find_quantile(stats, .5);
110113
double pct95 = find_quantile(stats, .95);
111-
printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
114+
printf("%-4s %-10s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n",
115+
name.c_str(), impl_strs[impl], rmse, stats.max_error, pct95, median);
112116
if (print_histogram) {
113117
printf("Error distribution:\n");
114118
for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
@@ -136,7 +140,7 @@ void test_roundtrip_on_layer(
136140
std::string & name,
137141
bool print_layer_stats,
138142
const quantize_fns_t & qfns,
139-
bool use_reference,
143+
ggml_quantize_impl_t impl,
140144
const ggml_tensor * layer,
141145
float * input_scratch,
142146
char *quantized_scratch,
@@ -158,11 +162,7 @@ void test_roundtrip_on_layer(
158162
input_scratch = ggml_get_data_f32(layer) + offset;
159163
}
160164

161-
if (use_reference) {
162-
qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
163-
} else {
164-
qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
165-
}
165+
qfns.quantize_row_q[impl](input_scratch, quantized_scratch, chunk_size);
166166
qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
167167

168168
update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
@@ -171,7 +171,7 @@ void test_roundtrip_on_layer(
171171
}
172172
}
173173
if (print_layer_stats) {
174-
print_error_stats(name, layer_error, false);
174+
print_error_stats(name, impl, layer_error, false);
175175
}
176176
}
177177

@@ -190,8 +190,21 @@ int main(int argc, char ** argv) {
190190
if (arg == "-h" || arg == "--help") {
191191
quantize_stats_print_usage(argc, argv);
192192
exit(0);
193-
} else if (arg == "-r" || arg == "--reference") {
194-
params.reference = true;
193+
} else if (arg == "-i" || arg == "--implementation") {
194+
if (++i >= argc) {
195+
invalid_param = true;
196+
break;
197+
}
198+
int j;
199+
for (j = 0; j < GGML_QUANTIZE_IMPL_COUNT && strcmp(argv[i], impl_strs[j]) != 0; j++) {
200+
// find match
201+
}
202+
if (j < GGML_QUANTIZE_IMPL_COUNT) {
203+
params.include_impl.push_back((ggml_quantize_impl_t)j);
204+
} else {
205+
fprintf(stderr, "error: %s not in list of implementations\n", argv[i]);
206+
invalid_param = true;
207+
}
195208
} else if (arg == "-v") {
196209
params.verbose = true;
197210
} else if (arg == "-p" || arg == "--per-layer-stats") {
@@ -302,42 +315,48 @@ int main(int argc, char ** argv) {
302315
std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
303316
std::vector<float> output_scratch(SCRATCH_ELEMENTS);
304317

305-
// loop throught quantization types
306-
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
307-
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
318+
// loop through quantization types
319+
for (int type = 0; type < GGML_TYPE_COUNT; type++) {
320+
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), type) == params.include_types.end()) {
308321
continue;
309322
}
310-
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
323+
quantize_fns_t qfns = ggml_internal_get_quantize_fn(type);
311324
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
312-
if (params.verbose) {
313-
printf("testing %s ...\n", type_strs[i]);
314-
}
315-
316-
error_stats global_stats {};
317-
318-
for (const auto& kv_tensor : tensors_sorted) {
319-
if (!layer_included(params, kv_tensor.first)) {
325+
for (int impl = 0; impl < GGML_QUANTIZE_IMPL_COUNT; impl++) {
326+
if (!params.include_impl.empty() && std::find(params.include_impl.begin(), params.include_impl.end(), impl) == params.include_impl.end()) {
320327
continue;
321328
}
329+
322330
if (params.verbose) {
323-
printf(" %s ...\n", kv_tensor.first.c_str());
331+
printf("testing %s %s ...\n", type_strs[type], impl_strs[impl]);
324332
}
325-
std::string layer_name { type_strs[i] };
326-
layer_name += "::" + kv_tensor.first;
327-
test_roundtrip_on_layer(
328-
layer_name,
329-
params.per_layer_stats,
330-
qfns,
331-
params.reference,
332-
kv_tensor.second,
333-
input_scratch.data(),
334-
quantized_scratch.data(),
335-
output_scratch.data(),
336-
global_stats
337-
);
338-
}
339333

340-
print_error_stats(type_strs[i], global_stats, params.print_histogram);
334+
error_stats global_stats {};
335+
336+
for (const auto& kv_tensor : tensors_sorted) {
337+
if (!layer_included(params, kv_tensor.first)) {
338+
continue;
339+
}
340+
if (params.verbose) {
341+
printf(" %s ...\n", kv_tensor.first.c_str());
342+
}
343+
std::string layer_name { type_strs[type] };
344+
layer_name += "::" + kv_tensor.first;
345+
test_roundtrip_on_layer(
346+
layer_name,
347+
params.per_layer_stats,
348+
qfns,
349+
(ggml_quantize_impl_t)impl,
350+
kv_tensor.second,
351+
input_scratch.data(),
352+
quantized_scratch.data(),
353+
output_scratch.data(),
354+
global_stats
355+
);
356+
}
357+
358+
print_error_stats(type_strs[type], (ggml_quantize_impl_t)impl, global_stats, params.print_histogram);
359+
}
341360
}
342361
}
343362

examples/quantize/scale.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import matplotlib.pyplot as plt
2+
3+
# Generated by quantizing the entire 7B model with the first element of each tuple as the scale factor.
4+
# The second element of the tuple is the number of q4_0 blocks for which that scale factor has lowest RMSE.
5+
data = (
6+
(-10.0, 0),
7+
(-9.9, 1),
8+
(-9.8, 3),
9+
(-9.7, 65),
10+
(-9.6, 738),
11+
(-9.5, 5779),
12+
(-9.4, 30880),
13+
(-9.3, 121078),
14+
(-9.2, 375674),
15+
(-9.1, 941350),
16+
(-9.0, 1990278),
17+
(-8.9, 3635317),
18+
(-8.8, 5891752),
19+
(-8.7, 8678748),
20+
(-8.6, 11771759),
21+
(-8.5, 14873993),
22+
(-8.4, 17594260),
23+
(-8.3, 19553100),
24+
(-8.2, 20415428),
25+
(-8.1, 20017134),
26+
(-8.0, 18357204),
27+
(-7.9, 15597612),
28+
(-7.8, 11993688),
29+
(-7.7, 7842970),
30+
(-7.6, 2880878),
31+
(-7.5, 3478),
32+
(-7.4, 2648437),
33+
(-7.3, 5641970),
34+
(-7.2, 5935890),
35+
(-7.1, 4910790),
36+
(-7.0, 3425891),
37+
(-6.9, 2068250),
38+
(-6.8, 1089883),
39+
(-6.7, 502462),
40+
(-6.6, 156356),
41+
(-6.5, 205),
42+
(-6.4, 163500),
43+
(-6.3, 386291),
44+
(-6.2, 423018),
45+
(-6.1, 319360),
46+
(-6.0, 180783),
47+
(-5.9, 78822),
48+
(-5.8, 28254),
49+
(-5.7, 8698),
50+
(-5.6, 1969),
51+
(-5.5, 0),
52+
(-5.4, 2069),
53+
(-5.3, 5722),
54+
(-5.2, 7107),
55+
(-5.1, 5113),
56+
(-5.0, 2332),
57+
(-4.9, 636),
58+
(-4.8, 130),
59+
(-4.7, 12),
60+
(-4.6, 1),
61+
(-4.5, 0),
62+
(-4.4, 3),
63+
(-4.3, 4),
64+
(-4.2, 8),
65+
(-4.1, 8),
66+
(-4.0, 27),
67+
)
68+
x, y = zip(*data)
69+
70+
fig, ax = plt.subplots()
71+
b = ax.bar(x, y, 0.1, bottom=1)
72+
ax.set_yscale("log")
73+
ax.set_xlabel("scale")
74+
ax.set_ylabel("N")
75+
plt.title("Quantization scale factor with lowest RMS error")
76+
plt.show()

0 commit comments

Comments (0)