Skip to content

Commit d51882b

Browse files
unboundedsw
authored and committed
quantize-stats command
Command that calculates some statistics over the errors introduced by quantization; at the moment these are the mean squared error and the maximum error for layer weights. It should be useful for testing quantization improvements. It needs some internal state from ggml and llama that should not be part of the public API.
1 parent 53dbba7 commit d51882b

File tree

9 files changed

+382
-11
lines changed

9 files changed

+382
-11
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ models/*
1919

2020
/main
2121
/quantize
22+
/quantize-stats
2223
/result
2324
/perplexity
2425
/embedding

Makefile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ common.o: examples/common.cpp examples/common.h
148148
$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
149149

150150
clean:
151-
rm -vf *.o main quantize perplexity embedding
151+
rm -vf *.o main quantize quantize-stats perplexity embedding
152152

153153
main: examples/main/main.cpp ggml.o llama.o common.o
154154
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
@@ -159,6 +159,9 @@ main: examples/main/main.cpp ggml.o llama.o common.o
159159
quantize: examples/quantize/quantize.cpp ggml.o llama.o
160160
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
161161

162+
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
163+
$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
164+
162165
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
163166
$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
164167

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ if (EMSCRIPTEN)
3131
else()
3232
add_subdirectory(main)
3333
add_subdirectory(quantize)
34+
add_subdirectory(quantize-stats)
3435
add_subdirectory(perplexity)
3536
add_subdirectory(embedding)
3637
endif()
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
set(TARGET quantize-stats)
2+
add_executable(${TARGET} quantize-stats.cpp)
3+
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
4+
target_compile_features(${TARGET} PRIVATE cxx_std_11)
Lines changed: 321 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,321 @@
1+
#include "ggml.h"
2+
#include "ggml_internal.h"
3+
#include "llama.h"
4+
#include "llama_internal.h"
5+
6+
#include <algorithm>
7+
#include <cassert>
8+
#include <cinttypes>
9+
#include <cmath>
10+
#include <cstdio>
11+
#include <cstring>
12+
#include <map>
13+
#include <string>
14+
#include <unordered_map>
15+
#include <vector>
16+
17+
static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
18+
static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
19+
20+
struct quantize_stats_params {
21+
std::string model = "models/7B/ggml-model-f16.bin";
22+
bool verbose = false;
23+
bool per_layer_stats = false;
24+
bool print_histogram = false;
25+
std::vector<std::string> include_layers;
26+
std::vector<std::string> exclude_layers;
27+
std::vector<enum ggml_type> include_types;
28+
};
29+
30+
// Number of buckets in the error histogram; the last bucket also collects
// every error at or beyond HISTOGRAM_RANGE.
const size_t HISTOGRAM_BUCKETS = 30;
// Absolute error covered by the histogram before the overflow bucket.
const double HISTOGRAM_RANGE = 0.03;

// Running error statistics for a set of quantized values.
// Every member starts at zero, so a default-constructed object is a valid
// empty accumulator even without an explicit `{}` initializer.
struct error_stats {
    size_t   num_samples = 0;                         // number of values compared
    double   total_error = 0.0;                       // sum of squared differences
    double   max_error   = 0.0;                       // largest absolute difference
    uint64_t error_histogram[HISTOGRAM_BUCKETS] = {}; // counts of absolute differences per bucket
};
39+
40+
41+
void quantize_stats_print_usage(int /*argc*/, char ** argv) {
42+
quantize_stats_params params;
43+
fprintf(stderr, "usage: %s [options]\n", argv[0]);
44+
fprintf(stderr, "\n");
45+
fprintf(stderr, "options:\n");
46+
fprintf(stderr, " -h, --help show this help message and exit\n");
47+
fprintf(stderr, " -m FNAME, --model FNAME\n");
48+
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
49+
fprintf(stderr, " -v, --verbose\n");
50+
fprintf(stderr, " verbose output (default: false)\n");
51+
fprintf(stderr, " -p, --per-layer-stats\n");
52+
fprintf(stderr, " print stats per layer (default: false)\n");
53+
fprintf(stderr, " --histogram\n");
54+
fprintf(stderr, " print error histogram (default: false)\n");
55+
fprintf(stderr, " -l LAYER, --include-layer LAYER\n");
56+
fprintf(stderr, " only test layers containing substring\n");
57+
fprintf(stderr, " -L LAYER, --exclude-layer LAYER\n");
58+
fprintf(stderr, " exclude layers containing substring\n");
59+
fprintf(stderr, " -t TYPE, --type TYPE\n");
60+
fprintf(stderr, " only test given type (q4_0, q4_1)\n");
61+
fprintf(stderr, "\n");
62+
}
63+
64+
// Check if a layer is included/excluded by command line
65+
bool layer_included(const quantize_stats_params params, const std::string & layer) {
66+
for (const auto& excluded : params.exclude_layers) {
67+
if (layer.find(excluded) != std::string::npos) {
68+
return false;
69+
}
70+
}
71+
for (const auto& included : params.include_layers) {
72+
if (layer.find(included) != std::string::npos) {
73+
return true;
74+
}
75+
}
76+
return params.include_layers.empty();
77+
}
78+
79+
// Update error statistics given vectors with the before/after result of quantization
80+
void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
81+
for (int64_t i = 0; i < nelements; i++) {
82+
double diff = input[i] - output[i];
83+
stats.total_error += diff * diff;
84+
stats.max_error = fmax(fabs(diff), stats.max_error);
85+
stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++;
86+
}
87+
stats.num_samples += nelements;
88+
}
89+
90+
void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
91+
printf("%-50s: mse %.8f, maxerr %.8f\n", name.c_str(), stats.total_error / (double) stats.num_samples, stats.max_error);
92+
if (print_histogram) {
93+
printf("Error distribution:\n");
94+
for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
95+
double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
96+
double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
97+
if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY;
98+
printf("[%3.3f, %3.3f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
99+
}
100+
}
101+
}
102+
103+
// copied from ggml.h - verify that we can access this as a flat array
104+
static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
105+
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
106+
107+
return
108+
tensor->nb[0] == ggml_type_size(tensor->type) &&
109+
tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
110+
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
111+
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
112+
}
113+
114+
// Run quantization function for a single layer and update error stats
115+
void test_roundtrip_on_layer(
116+
std::string & name,
117+
bool print_layer_stats,
118+
const quantize_fns_t & qfns,
119+
const ggml_tensor * layer,
120+
float * input_scratch,
121+
char *quantized_scratch,
122+
float * output_scratch,
123+
error_stats & total_error) {
124+
125+
assert(tensor_is_contiguous(layer));
126+
int64_t nelements = ggml_nelements(layer);
127+
128+
if (layer->type == GGML_TYPE_F16) {
129+
for (int i = 0; i < nelements; i++) {
130+
input_scratch[i] = ggml_get_f32_1d(layer, i);
131+
}
132+
} else {
133+
input_scratch = ggml_get_data_f32(layer);
134+
}
135+
136+
qfns.quantize_row_q(input_scratch, quantized_scratch, nelements);
137+
qfns.dequantize_row_q(quantized_scratch, output_scratch, nelements);
138+
139+
update_error_stats(nelements, input_scratch, output_scratch, total_error);
140+
if (print_layer_stats) {
141+
error_stats layer_error {};
142+
update_error_stats(nelements, input_scratch, output_scratch, layer_error);
143+
print_error_stats(name, layer_error, false);
144+
}
145+
}
146+
147+
int main(int argc, char ** argv) {
148+
ggml_time_init();
149+
150+
quantize_stats_params params;
151+
152+
// read command line
153+
154+
bool invalid_param = false;
155+
std::string arg;
156+
for (int i = 1; i < argc; i++) {
157+
arg = argv[i];
158+
159+
if (arg == "-h" || arg == "--help") {
160+
quantize_stats_print_usage(argc, argv);
161+
exit(0);
162+
} else if (arg == "-v") {
163+
params.verbose = true;
164+
} else if (arg == "-p" || arg == "--per-layer-stats") {
165+
params.per_layer_stats = true;
166+
} else if (arg == "--histogram") {
167+
params.print_histogram = true;
168+
} else if (arg == "-m" || arg == "--model") {
169+
if (++i >= argc) {
170+
invalid_param = true;
171+
break;
172+
}
173+
params.model = argv[i];
174+
} else if (arg == "-l" || arg == "--include-layer") {
175+
if (++i >= argc) {
176+
invalid_param = true;
177+
break;
178+
}
179+
params.include_layers.push_back(argv[i]);
180+
} else if (arg == "-L" || arg == "--exclude-layer") {
181+
if (++i >= argc) {
182+
invalid_param = true;
183+
break;
184+
}
185+
params.exclude_layers.push_back(argv[i]);
186+
} else if (arg == "-t" || arg == "--type") {
187+
if (++i >= argc) {
188+
invalid_param = true;
189+
break;
190+
}
191+
int j;
192+
for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) {
193+
// find match
194+
}
195+
if (j < GGML_TYPE_COUNT) {
196+
params.include_types.push_back((ggml_type) j);
197+
} else {
198+
fprintf(stderr, "error: %s not in list of types\n", argv[i]);
199+
invalid_param = true;
200+
}
201+
} else {
202+
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
203+
quantize_stats_print_usage(argc, argv);
204+
return 1;
205+
}
206+
}
207+
if (invalid_param) {
208+
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
209+
quantize_stats_print_usage(argc, argv);
210+
return 1;
211+
}
212+
213+
// load the model
214+
fprintf(stderr, "Loading model\n");
215+
216+
const int64_t t_main_start_us = ggml_time_us();
217+
llama_context * ctx;
218+
219+
{
220+
auto lparams = llama_context_default_params();
221+
222+
lparams.n_ctx = 256;
223+
lparams.n_parts = 1;
224+
lparams.seed = 1;
225+
lparams.f16_kv = false;
226+
lparams.use_mlock = false;
227+
228+
ctx = llama_init_from_file(params.model.c_str(), lparams);
229+
230+
if (ctx == NULL) {
231+
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
232+
return 1;
233+
}
234+
}
235+
236+
// Sort tensors for consistent output
237+
const auto tensors = llama_internal_get_tensor_map(ctx);
238+
std::map<std::string, struct ggml_tensor *> tensors_sorted { tensors.begin(), tensors.end() };
239+
240+
// check layer tensors
241+
int included_layers = 0;
242+
int64_t max_nelements = 0;
243+
bool is_f16 = false;
244+
for (const auto& kv_tensor : tensors_sorted) {
245+
if (!layer_included(params, kv_tensor.first)) {
246+
continue;
247+
}
248+
if (params.verbose) {
249+
printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second));
250+
}
251+
if (kv_tensor.second->type == GGML_TYPE_F16) {
252+
is_f16 = true;
253+
} else if (kv_tensor.second->type != GGML_TYPE_F32) {
254+
fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
255+
"this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
256+
llama_free(ctx);
257+
return 1;
258+
}
259+
included_layers++;
260+
max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
261+
}
262+
263+
if (is_f16) {
264+
printf("note: source model is f16\n");
265+
}
266+
printf("testing %d layers with max size %" PRId64 ", allocating %" PRId64 " bytes\n", included_layers, max_nelements, 3*4*max_nelements);
267+
// allocate scratch space
268+
std::vector<float> input_scratch(max_nelements);
269+
std::vector<char> quantized_scratch(max_nelements*4);
270+
std::vector<float> output_scratch(max_nelements);
271+
272+
// loop throught quantization types
273+
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
274+
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
275+
continue;
276+
}
277+
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
278+
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
279+
if (params.verbose) {
280+
printf("testing %s ...\n", type_strs[i]);
281+
}
282+
283+
error_stats global_stats {};
284+
285+
for (const auto& kv_tensor : tensors_sorted) {
286+
if (!layer_included(params, kv_tensor.first)) {
287+
continue;
288+
}
289+
if (params.verbose) {
290+
printf(" %s ...\n", kv_tensor.first.c_str());
291+
}
292+
std::string layer_name { type_strs[i] };
293+
layer_name += "::" + kv_tensor.first;
294+
test_roundtrip_on_layer(
295+
layer_name,
296+
params.per_layer_stats,
297+
qfns,
298+
kv_tensor.second,
299+
input_scratch.data(),
300+
quantized_scratch.data(),
301+
output_scratch.data(),
302+
global_stats
303+
);
304+
}
305+
306+
print_error_stats(type_strs[i], global_stats, params.print_histogram);
307+
}
308+
}
309+
310+
311+
llama_free(ctx);
312+
// report timing
313+
{
314+
const int64_t t_main_end_us = ggml_time_us();
315+
316+
printf("\n");
317+
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
318+
}
319+
320+
return 0;
321+
}

ggml.c

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#define _GNU_SOURCE
33

44
#include "ggml.h"
5+
#include "ggml_internal.h"
56

67
#if defined(_MSC_VER) || defined(__MINGW32__)
78
#include <malloc.h> // using malloc.h with MSC/MINGW
@@ -6525,16 +6526,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
65256526
//}
65266527
}
65276528

6528-
typedef void (*dequantize_row_q_t)(const void * restrict x, float * restrict y, int k);
6529-
typedef void (*quantize_row_q_t)(const float * restrict x, void * restrict y, int k);
6530-
typedef void (*vec_dot_q_t)(const int n, float * restrict s, const void * restrict x, const void * restrict y);
6531-
6532-
typedef struct {
6533-
dequantize_row_q_t dequantize_row_q;
6534-
quantize_row_q_t quantize_row_q;
6535-
vec_dot_q_t vec_dot_q;
6536-
} quantize_fns_t;
6537-
65386529
static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
65396530
[GGML_TYPE_Q4_0] = {
65406531
.dequantize_row_q = dequantize_row_q4_0,
@@ -6548,6 +6539,12 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
65486539
},
65496540
};
65506541

6542+
// For internal test use
6543+
quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
6544+
GGML_ASSERT(i < GGML_TYPE_COUNT);
6545+
return quantize_fns[i];
6546+
}
6547+
65516548
static void ggml_compute_forward_mul_mat_q_f32(
65526549
const struct ggml_compute_params * params,
65536550
const struct ggml_tensor * src0,

0 commit comments

Comments
 (0)